Stock portfolios: Graph-based approaches in stock market analysis
1.Import data and data processing¶
1.1 Import data
In [1]:
# creat and draw a graph (network)
import networkx as nx
import matplotlib.pyplot as plt
import plotly.express as px
# load data
import pandas as pd
import numpy as np
In [2]:
# Load the raw HSX price file and give its columns readable names
df_raw = pd.read_csv("CafeF.HSX.Upto13.11.2023.csv")
df_raw.columns = ["Ticker", "TradingDate", "Open", "High", "Low", "Close", "Total Trade Quantity"]
df_raw["TradingDate"] = pd.to_datetime(df_raw["TradingDate"], format="%Y%m%d")

# Wide table of close prices: one row per trading date, one column per ticker
df_stock = (
    df_raw
    .pivot_table(index="TradingDate", columns="Ticker", values="Close")
    .loc["2022-01-01":"2023-11-13"]
)
# Keep only dates on which the index has a value, then only tickers with no gaps
df_stock = df_stock.dropna(subset=["VNAll-INDEX"]).dropna(axis=1)

# Split off the VNAll index into its own single-column frame
vnindex = pd.DataFrame(
    data=df_stock["VNAll-INDEX"].values,
    index=df_stock.index,
    columns=["Close"],
)
# Remove the index column so `data` holds individual stocks only
data = df_stock.drop(columns=["VNAll-INDEX"])
data
Out[2]:
| Ticker | AAA | AAT | ABS | ACB | ACC | ACL | ADS | AGG | AGR | ANV | ... | VOS | VPB | VPG | VPH | VPI | VRE | VSC | VSH | VTO | YEG |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TradingDate | |||||||||||||||||||||
| 2022-01-04 | 20.70 | 15.5148 | 26.4241 | 22.9875 | 16.2744 | 16.9286 | 27.0721 | 36.7089 | 24.5350 | 31.8305 | ... | 19.05 | 22.9097 | 48.3272 | 12.9075 | 49.9372 | 31.15 | 34.9553 | 25.9606 | 11.2537 | 25.50 |
| 2022-01-06 | 22.40 | 15.5584 | 26.8029 | 22.5531 | 18.6018 | 17.5888 | 27.0343 | 37.6897 | 23.9096 | 33.0440 | ... | 18.50 | 22.3378 | 46.9730 | 14.7183 | 52.8800 | 35.60 | 34.3124 | 26.0056 | 11.3389 | 24.40 |
| 2022-01-07 | 22.80 | 15.1226 | 27.8921 | 22.2524 | 19.8864 | 17.2116 | 26.9209 | 38.1101 | 24.1502 | 32.9506 | ... | 19.00 | 21.9882 | 46.5499 | 15.7398 | 53.6825 | 34.75 | 34.7946 | 25.9606 | 11.3816 | 23.65 |
| 2022-01-10 | 21.25 | 14.6432 | 28.8866 | 22.0519 | 21.2660 | 17.2116 | 25.0682 | 39.0908 | 23.0918 | 31.2238 | ... | 18.40 | 21.8611 | 44.8571 | 16.8077 | 52.6124 | 35.00 | 34.3124 | 25.7803 | 11.0832 | 22.00 |
| 2022-01-11 | 20.00 | 14.4689 | 29.1233 | 21.8849 | 22.7408 | 17.3059 | 23.7070 | 41.8230 | 22.7069 | 29.8703 | ... | 18.00 | 21.7976 | 45.0264 | 17.9684 | 51.8099 | 34.20 | 33.5089 | 25.8704 | 10.6569 | 20.50 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2023-11-07 | 8.72 | 4.7200 | 5.2700 | 22.1000 | 11.0000 | 12.4000 | 12.8500 | 25.0000 | 13.5500 | 27.7500 | ... | 9.46 | 19.7312 | 15.3000 | 7.4500 | 54.0000 | 23.15 | 24.4000 | 43.3500 | 9.0100 | 16.75 |
| 2023-11-08 | 9.29 | 4.8600 | 5.4500 | 22.8000 | 11.1500 | 12.5500 | 13.4000 | 25.9000 | 14.4500 | 29.6500 | ... | 10.05 | 20.3508 | 16.3500 | 7.9700 | 55.0000 | 23.70 | 25.0000 | 43.1000 | 9.1500 | 17.40 |
| 2023-11-09 | 9.21 | 4.9100 | 5.6600 | 22.5500 | 11.2000 | 12.6500 | 13.8000 | 26.7000 | 14.9500 | 29.6500 | ... | 10.05 | 20.0000 | 16.7000 | 8.2500 | 56.0000 | 24.30 | 26.7500 | 43.6000 | 9.0200 | 17.20 |
| 2023-11-10 | 9.10 | 4.8500 | 5.6000 | 22.4000 | 11.3000 | 12.5000 | 13.7000 | 26.6000 | 14.9000 | 29.0000 | ... | 9.90 | 19.5500 | 16.2500 | 8.2500 | 56.7000 | 23.95 | 26.6500 | 43.5500 | 9.0500 | 16.50 |
| 2023-11-13 | 9.12 | 4.8500 | 5.5200 | 22.4500 | 11.3000 | 12.4000 | 13.5000 | 26.0000 | 14.8000 | 28.9500 | ... | 9.85 | 19.2000 | 16.7500 | 8.1800 | 56.4000 | 23.50 | 27.0500 | 43.7000 | 9.0500 | 15.70 |
454 rows × 253 columns
1.2.Train and test data
In [3]:
# Split date. Label slices are inclusive at both ends, so the row for `date`
# appears in both the train and the test frames.
date = '2023-10-13'
train_window = slice('2022-01-01', date)
test_window = slice(date, '2023-11-13')

data_train, data_test = data.loc[train_window], data.loc[test_window]
vnindex_train, vnindex_test = vnindex.loc[train_window], vnindex.loc[test_window]
In [4]:
# Report the (rows, columns) shape of each split
print("Train:",data_train.shape)
print("Test:",data_test.shape)
Train: (433, 253) Test: (22, 253)
1.3.Correlation matrix
In [5]:
# Correlation matrix
# Pairwise correlation between the ticker columns of the training frame.
# NOTE(review): data_train holds close prices, so this correlates price levels
# rather than returns — confirm this is the intended input for the distance metric.
corr = data_train.corr()
corr
Out[5]:
| Ticker | AAA | AAT | ABS | ACB | ACC | ACL | ADS | AGG | AGR | ANV | ... | VOS | VPB | VPG | VPH | VPI | VRE | VSC | VSH | VTO | YEG |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Ticker | |||||||||||||||||||||
| AAA | 1.000000 | 0.800995 | 0.925286 | 0.621155 | 0.843179 | 0.640173 | 0.894249 | 0.837729 | 0.855387 | 0.360453 | ... | 0.833451 | 0.833812 | 0.873860 | 0.917101 | 0.046521 | 0.783166 | 0.564215 | -0.306034 | 0.809277 | 0.780926 |
| AAT | 0.800995 | 1.000000 | 0.872650 | 0.218903 | 0.767929 | 0.837582 | 0.824752 | 0.820021 | 0.557446 | 0.560059 | ... | 0.859875 | 0.568635 | 0.909345 | 0.769936 | 0.324882 | 0.549126 | 0.797596 | -0.442349 | 0.456218 | 0.869552 |
| ABS | 0.925286 | 0.872650 | 1.000000 | 0.376528 | 0.889446 | 0.649128 | 0.858637 | 0.933118 | 0.734863 | 0.274155 | ... | 0.833286 | 0.713374 | 0.931276 | 0.898980 | 0.205104 | 0.754311 | 0.631374 | -0.559246 | 0.684377 | 0.810823 |
| ACB | 0.621155 | 0.218903 | 0.376528 | 1.000000 | 0.340917 | 0.244736 | 0.545067 | 0.334682 | 0.773024 | 0.223521 | ... | 0.404994 | 0.792058 | 0.371723 | 0.591299 | -0.396326 | 0.621569 | 0.131757 | 0.266102 | 0.747304 | 0.271166 |
| ACC | 0.843179 | 0.767929 | 0.889446 | 0.340917 | 1.000000 | 0.502891 | 0.711758 | 0.801236 | 0.634632 | 0.249388 | ... | 0.743277 | 0.621205 | 0.777770 | 0.794516 | 0.256396 | 0.622726 | 0.456402 | -0.454469 | 0.630557 | 0.753552 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| VRE | 0.783166 | 0.549126 | 0.754311 | 0.621569 | 0.622726 | 0.408428 | 0.689090 | 0.701112 | 0.695075 | 0.097110 | ... | 0.554348 | 0.780027 | 0.682744 | 0.738304 | -0.026932 | 1.000000 | 0.344394 | -0.380215 | 0.715964 | 0.474823 |
| VSC | 0.564215 | 0.797596 | 0.631374 | 0.131757 | 0.456402 | 0.867379 | 0.741603 | 0.678828 | 0.389239 | 0.641634 | ... | 0.720799 | 0.411365 | 0.772823 | 0.566335 | 0.252062 | 0.344394 | 1.000000 | -0.207870 | 0.326632 | 0.741112 |
| VSH | -0.306034 | -0.442349 | -0.559246 | 0.266102 | -0.454469 | -0.184940 | -0.318366 | -0.528836 | -0.065181 | 0.325275 | ... | -0.226182 | -0.118665 | -0.510829 | -0.277537 | -0.187729 | -0.380215 | -0.207870 | 1.000000 | -0.147014 | -0.218009 |
| VTO | 0.809277 | 0.456218 | 0.684377 | 0.747304 | 0.630557 | 0.420525 | 0.805536 | 0.665324 | 0.872062 | 0.152037 | ... | 0.595179 | 0.890941 | 0.671544 | 0.791985 | -0.199332 | 0.715964 | 0.326632 | -0.147014 | 1.000000 | 0.467352 |
| YEG | 0.780926 | 0.869552 | 0.810823 | 0.271166 | 0.753552 | 0.801225 | 0.725785 | 0.805502 | 0.547680 | 0.676958 | ... | 0.942562 | 0.553524 | 0.801552 | 0.747124 | 0.425644 | 0.474823 | 0.741112 | -0.218009 | 0.467352 | 1.000000 |
253 rows × 253 columns
1.4. Distance matrix and graph
In [6]:
#The node size is proportional to the degree
def get_opption(degrees, scale = 200):
    """Build drawing keyword options where node size and colour follow node degree.

    Args:
        degrees: iterable of numeric degree values, one per node.
        scale: multiplier applied to each degree before adding the base size of 10.

    Returns:
        dict of drawing options: per-node sizes and colours plus fixed font,
        edge-colour, line-width, edge-width and alpha settings.
    """
    return {
        "font_size": 20,
        # size grows linearly with degree; 10 is the minimum size
        "node_size": [10 + degree * scale for degree in degrees],
        # the raw degree values double as the colour scale
        "node_color": list(degrees),
        "edgecolors": "black",
        "linewidths": 0.2,
        "width": 0.4,
        "alpha": 0.6,
    }
In [7]:
# Tính khoảng cách
# Correlation-based distance: d_ij = sqrt(2 * (1 - rho_ij)).
# The radicand is clipped at zero first, because floating-point round-off can
# leave a correlation marginally above 1, which would otherwise yield NaN.
distance_matrix = np.sqrt((2 * (1 - corr)).clip(lower=0))
distance_matrix
Out[7]:
| Ticker | AAA | AAT | ABS | ACB | ACC | ACL | ADS | AGG | AGR | ANV | ... | VOS | VPB | VPG | VPH | VPI | VRE | VSC | VSH | VTO | YEG |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Ticker | |||||||||||||||||||||
| AAA | 0.000000 | 0.630880 | 0.386559 | 0.870454 | 0.560037 | 0.848324 | 0.459893 | 0.569685 | 0.537798 | 1.130971 | ... | 0.577147 | 0.576520 | 0.502275 | 0.407182 | 1.380927 | 0.658534 | 0.933580 | 1.616190 | 0.617613 | 0.661928 |
| AAT | 0.630880 | 0.000000 | 0.504677 | 1.249878 | 0.681279 | 0.569944 | 0.592026 | 0.599965 | 0.940802 | 0.938020 | ... | 0.529386 | 0.928832 | 0.425804 | 0.678328 | 1.161996 | 0.949604 | 0.636246 | 1.698440 | 1.042863 | 0.510780 |
| ABS | 0.386559 | 0.504677 | 0.000000 | 1.116666 | 0.470222 | 0.837702 | 0.531719 | 0.365739 | 0.728199 | 1.204861 | ... | 0.577432 | 0.757134 | 0.370739 | 0.449488 | 1.260869 | 0.700983 | 0.858634 | 1.765925 | 0.794510 | 0.615105 |
| ACB | 0.870454 | 1.249878 | 1.116666 | 0.000000 | 1.148114 | 1.229035 | 0.953869 | 1.153532 | 0.673760 | 1.246178 | ... | 1.090876 | 0.644890 | 1.120962 | 0.904102 | 1.671123 | 0.869979 | 1.317758 | 1.211526 | 0.710909 | 1.207339 |
| ACC | 0.560037 | 0.681279 | 0.470222 | 1.148114 | 0.000000 | 0.997104 | 0.759265 | 0.630499 | 0.854831 | 1.225245 | ... | 0.716551 | 0.870397 | 0.666679 | 0.641068 | 1.219511 | 0.868647 | 1.042687 | 1.705561 | 0.859585 | 0.702066 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| VRE | 0.658534 | 0.949604 | 0.700983 | 0.869979 | 0.868647 | 1.087724 | 0.788555 | 0.773160 | 0.780929 | 1.343793 | ... | 0.944089 | 0.663285 | 0.796563 | 0.723458 | 1.433131 | 0.000000 | 1.145081 | 1.661454 | 0.753705 | 1.024867 |
| VSC | 0.933580 | 0.636246 | 0.858634 | 1.317758 | 1.042687 | 0.515017 | 0.718883 | 0.801464 | 1.105225 | 0.846600 | ... | 0.747263 | 1.085020 | 0.674058 | 0.931306 | 1.223060 | 1.145081 | 0.000000 | 1.554265 | 1.160490 | 0.719566 |
| VSH | 1.616190 | 1.698440 | 1.765925 | 1.211526 | 1.705561 | 1.539442 | 1.623802 | 1.748620 | 1.459576 | 1.161658 | ... | 1.566002 | 1.495771 | 1.738292 | 1.598460 | 1.541252 | 1.661454 | 1.554265 | 0.000000 | 1.514605 | 1.560775 |
| VTO | 0.617613 | 1.042863 | 0.794510 | 0.710909 | 0.859585 | 1.076546 | 0.623642 | 0.818140 | 0.505842 | 1.302277 | ... | 0.899801 | 0.467030 | 0.810501 | 0.645004 | 1.548762 | 0.753705 | 1.160490 | 1.514605 | 0.000000 | 1.032132 |
| YEG | 0.661928 | 0.510780 | 0.615105 | 1.207339 | 0.702066 | 0.630515 | 0.740561 | 0.623695 | 0.951126 | 0.803794 | ... | 0.338934 | 0.944961 | 0.629997 | 0.711163 | 1.071780 | 1.024867 | 0.719566 | 1.560775 | 1.032132 | 0.000000 |
253 rows × 253 columns
In [8]:
# crete graph from matrix
# Create a graph from the distance matrix: tickers become nodes and matrix
# entries become edges. The printed summary below reports 253 nodes and 31878
# edges, i.e. every pair of tickers is connected and there are no self-loops.
d = distance_matrix
G_stock = nx.from_pandas_adjacency(d)
print(G_stock)
Graph with 253 nodes and 31878 edges
In [9]:
#draw the undirected Graph
# Draw the full undirected stock graph; node labels are hidden
figure = plt.figure(figsize=(22, 10))
plt.title("Stock")
nx.draw_networkx(G_stock, with_labels=False)
plt.show()
2. Filtering¶
2.1.Minimum spanning tree
In [10]:
# Filter the dense graph down to its minimum spanning tree.
# NOTE(review): called with default arguments — presumably the distance stored
# on each edge is what gets minimised; confirm against the NetworkX defaults.
T = nx.minimum_spanning_tree(G_stock)
In [11]:
plt.figure(figsize=(20,15))
plt.title("Stock")
cmap=plt.cm.BrBG  # not referenced by the drawing call below
# Draw the undirected tree. spring_layout starts from random positions, so a
# fixed seed is passed to make the figure reproducible across runs.
pos = nx.spring_layout(T, seed=42)
nx.draw_networkx(
    T,
    pos=pos,
    with_labels=True,
    node_color='lightgreen',
    node_size=600,
    edge_color='black',
    linewidths=1,
    font_size=8,
    alpha=1,
)
plt.show();
2.2 Centrality measures and analysis
In [12]:
# Four centrality measures computed on the minimum spanning tree T; each call
# returns a mapping from ticker to score that is plotted in the cells below.
# NOTE(review): no weight/distance argument is passed to any of these calls, so
# they run with NetworkX's defaults rather than explicitly using the edge
# distances — confirm this is intended.
degree_centrality = nx.degree_centrality(T)
closeness_centrality = nx.closeness_centrality(T)
betweenness_centrality = nx.betweenness_centrality(T)
eigenvector_centrality=nx.eigenvector_centrality_numpy(T)
2.2.1 Degree Centrality
In [13]:
# Tabulate degree centrality per ticker, highest first, and show it as a bar chart
dc_data = pd.DataFrame({
    'stocks': list(degree_centrality.keys()),
    'degree_centrality': list(degree_centrality.values()),
}).sort_values('degree_centrality', ascending=False)
px.bar(data_frame=dc_data, x='stocks', y='degree_centrality', template='plotly_dark')
2.2.2. Closeness Centrality
In [14]:
# Tabulate closeness centrality per ticker, highest first, and show it as a bar chart
cc_data = pd.DataFrame({
    'stocks': list(closeness_centrality.keys()),
    'closeness_centrality': list(closeness_centrality.values()),
}).sort_values('closeness_centrality', ascending=False)
px.bar(data_frame=cc_data, x='stocks', y='closeness_centrality', template='plotly_dark')
2.2.3. Betweenness Centrality
In [15]:
# Tabulate betweenness centrality per ticker, highest first, and show it as a bar chart
bc_data = pd.DataFrame({
    'stocks': list(betweenness_centrality.keys()),
    'betweenness_centrality': list(betweenness_centrality.values()),
}).sort_values('betweenness_centrality', ascending=False)
px.bar(data_frame=bc_data, x='stocks', y='betweenness_centrality', template='plotly_dark')
2.2.4.Eigenvector Centrality
In [18]:
# Tabulate eigenvector centrality per ticker, highest first, and show it as a bar chart.
# Stored under its own name (ec_data) so the betweenness table in bc_data is not overwritten.
ec_data = pd.DataFrame({
    'stocks': list(eigenvector_centrality.keys()),
    'eigenvector_centrality': list(eigenvector_centrality.values()),
}).sort_values('eigenvector_centrality', ascending=False)
px.bar(data_frame=ec_data, x='stocks', y='eigenvector_centrality', template='plotly_dark')
3. Modeling: Hierarchical clustering¶
3.1. The distance between two stocks
In [211]:
# Tính toán ma trận khoảng cách ngắn nhất
shortest_paths = dict(nx.all_pairs_dijkstra_path_length(T))
# Chuyển kết quả thành DataFrame
df_path = pd.DataFrame(shortest_paths)
df_path
Out[211]:
| AAA | AAT | ABS | ACB | ACC | ACL | ADS | AGG | AGR | ANV | ... | VOS | VPB | VPG | VPH | VPI | VRE | VSC | VSH | VTO | YEG | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAA | 0.000000 | 2.037785 | 0.899128 | 5.397939 | 1.369350 | 1.854991 | 2.347372 | 1.696214 | 3.201172 | 2.697912 | ... | 2.820441 | 3.402450 | 1.007982 | 2.146925 | 3.482404 | 2.144981 | 2.370008 | 7.065757 | 3.107685 | 3.159375 |
| HQC | 0.234630 | 2.272415 | 1.133758 | 5.632570 | 1.603980 | 2.089622 | 2.582002 | 1.930845 | 3.435803 | 2.932543 | ... | 3.055071 | 3.637080 | 1.242612 | 2.381555 | 3.717035 | 2.379612 | 2.604638 | 7.300387 | 3.342315 | 3.394005 |
| FIT | 0.251560 | 1.786225 | 0.647568 | 5.146380 | 1.117790 | 1.603432 | 2.095812 | 1.444655 | 2.949613 | 2.446353 | ... | 2.568881 | 3.150890 | 0.756422 | 1.895365 | 3.230845 | 1.893422 | 2.118448 | 6.814197 | 2.856125 | 2.907815 |
| POW | 0.376193 | 2.413977 | 1.275321 | 5.774132 | 1.745542 | 2.231184 | 2.723564 | 2.072407 | 3.577365 | 3.074105 | ... | 3.196633 | 3.778643 | 1.384175 | 2.523118 | 3.858597 | 2.521174 | 2.746200 | 7.441949 | 3.483877 | 3.535568 |
| KMR | 0.425435 | 1.612350 | 0.473693 | 4.972505 | 0.943915 | 1.429557 | 1.921937 | 1.270780 | 2.775738 | 2.272478 | ... | 2.395006 | 2.977015 | 0.582547 | 1.721490 | 3.056970 | 1.719547 | 1.944573 | 6.640322 | 2.682250 | 2.733940 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| CTF | 8.923017 | 9.173815 | 8.325346 | 7.344165 | 8.795568 | 9.602731 | 9.773590 | 8.832245 | 5.721845 | 10.445652 | ... | 9.345624 | 8.442347 | 8.755721 | 8.672109 | 10.618435 | 8.368894 | 10.117747 | 1.857261 | 7.122266 | 9.684559 |
| FIR | 9.165357 | 9.416155 | 8.567686 | 7.586505 | 9.037908 | 9.845071 | 10.015930 | 9.074585 | 5.964185 | 10.687992 | ... | 9.587965 | 8.684687 | 8.998061 | 8.914449 | 10.860775 | 8.611234 | 10.360087 | 2.099601 | 7.364606 | 9.926899 |
| KOS | 9.561251 | 9.812049 | 8.963580 | 7.982399 | 9.433802 | 10.240965 | 10.411824 | 9.470479 | 6.360079 | 11.083886 | ... | 9.983859 | 9.080581 | 9.393955 | 9.310343 | 11.256669 | 9.007128 | 10.755981 | 2.495495 | 7.760500 | 10.322793 |
| SAB | 10.018278 | 10.269075 | 9.420607 | 8.439425 | 9.890828 | 10.697991 | 10.868850 | 9.927505 | 6.817105 | 11.540912 | ... | 10.440885 | 9.537607 | 9.850982 | 9.767369 | 11.713695 | 9.464155 | 11.213008 | 2.952521 | 8.217526 | 10.779819 |
| BCM | 10.820987 | 11.071785 | 10.223316 | 9.242134 | 10.693538 | 11.500700 | 11.671559 | 10.730214 | 7.619815 | 12.343621 | ... | 11.243594 | 10.340316 | 10.653691 | 10.570079 | 12.516404 | 10.266864 | 12.015717 | 3.755230 | 9.020235 | 11.582528 |
253 rows × 253 columns
3.2 Hierarchical clustering
In [212]:
import scipy.cluster.hierarchy as shc
# Dendrogram of a Ward-linkage hierarchy built from the rows of df_path
plt.figure(figsize=(18, 7))
plt.title("Dendrograms")
# color_threshold=-np.inf — presumably so the whole tree is drawn in one colour; confirm
dend = shc.dendrogram(shc.linkage(df_path, method='ward'), color_threshold=-np.inf)
# NOTE(review): a horizontal line at y=-inf is unlikely to be visible — confirm its purpose
plt.axhline(y=-np.inf, c='blue');
In [213]:
# Same dendrogram as the previous cell, with a dashed horizontal line added at height 140
plt.figure(figsize=(18, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(shc.linkage(df_path, method='ward'), color_threshold=-np.inf)
plt.axhline(y=-np.inf, c='blue')
# NOTE(review): 140 looks like a visually chosen cut height for the cluster count used below — confirm
plt.axhline(y=140, color='black', linestyle='--');
In [214]:
from sklearn.cluster import AgglomerativeClustering
# Ward agglomerative clustering into 5 groups, using the rows of df_path as feature vectors
clusters = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
# labels[i] is the cluster of the i-th ROW of df_path. Later cells match labels to
# tickers purely by position, so the row order of df_path must agree with the
# ticker/node order used there — verify.
labels = clusters.fit_predict(df_path)
labels
Out[214]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 3, 0, 0, 0, 0, 3, 0, 0,
0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0,
3, 0, 4, 0, 0, 0, 3, 0, 3, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3,
3, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 3, 3,
4, 4, 4, 4, 4, 0, 0, 0, 3, 0, 0, 0, 3, 3, 3, 0, 4, 4, 4, 4, 4, 4,
0, 3, 4, 3, 0, 0, 0, 0, 4, 4, 4, 3, 4, 4, 4, 3, 3, 4, 3, 4, 3, 3,
4, 4, 4, 3, 4, 4, 4, 1, 4, 4, 1, 1, 4, 1, 1, 1, 1, 3, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [215]:
import seaborn as sns
# Bar chart of how many stocks fall in each cluster label, with the count written on each bar
ax=sns.countplot(x=labels)
ax.bar_label(ax.containers[0]);
In [216]:
plt.figure(figsize=(22,15))
# Draw the tree and colour each node by its cluster label.
# spring_layout starts from random positions, so a fixed seed is passed to make
# the figure reproducible across runs.
pos = nx.spring_layout(T, seed=42)
nx.draw_networkx(
    T,
    pos=pos,
    with_labels=True,
    node_color=labels,  # matched to the nodes of T by position
    cmap='Set1',
    node_size=600,
    edge_color='black',
    linewidths=1,
    font_size=8,
    alpha=1,
)
plt.show()
4. Stock Portfolio¶
In [217]:
# Tính returns
def get_returns(df):
    """Compute period-over-period log returns.

    Args:
        df: pandas DataFrame or Series of prices, one row per period.

    Returns:
        Object of the same type holding log(price_t / price_{t-1}), with rows
        containing missing values (including the first row) dropped.
    """
    previous = df.shift(1)
    log_ratio = np.log(df / previous)
    return log_ratio.dropna()
def get_clusters(data,label):
    """Group the column names of `data` by cluster label.

    Args:
        data: DataFrame whose columns are the items being clustered.
        label: array of cluster labels, matched to `data.columns` by position.

    Returns:
        list with one Index of column names per distinct label, ordered by
        ascending label value.
    """
    tickers = data.columns
    groups = []
    for cluster_id in np.unique(label):
        # the boolean mask picks the columns assigned to this cluster
        groups.append(tickers[label == cluster_id])
    return groups
def get_portflolio(cluster,returns, k=2):
    """Pick the k assets with the largest Sharpe ratio from every cluster.

    Args:
        cluster: iterable of column selections, one per cluster.
        returns: DataFrame of asset returns, indexable by each cluster's columns.
        k: number of assets taken from each cluster.

    Returns:
        flat list of the selected asset names, cluster by cluster.
    """
    picks = []
    for members in cluster:
        ranked = sharpe_ratio(returns[members])
        picks.extend(list(ranked.nlargest(k).index))
    return picks
# Trading days per year, used for both the daily risk-free rate and annualisation
TRADING_DAYS_PER_YEAR = 252
# Daily risk-free rate: 3% a year spread over trading days, so it is on the same
# per-trading-day basis as the daily returns it is subtracted from.
risk_free_rate = 0.03 / TRADING_DAYS_PER_YEAR


def sharpe_ratio(returns,rf =risk_free_rate):
    """Annualised Sharpe ratio of daily returns.

    Args:
        returns: pandas Series or DataFrame of daily returns (one column per asset).
        rf: daily risk-free rate subtracted from the mean return.

    Returns:
        A scalar for a Series, or a Series indexed by column for a DataFrame.
    """
    excess = returns.mean() - rf
    # Annualising a daily ratio multiplies by sqrt(trading days); the earlier
    # version divided. Both are positive scalings, so asset ranking is unaffected.
    return excess / returns.std() * np.sqrt(TRADING_DAYS_PER_YEAR)
In [218]:
# chia các cổ phiếu theo cụm đã tính toán ở mục 3
# Split the tickers into the clusters computed in section 3.
# Note: this rebinds `clusters`, which until now held the AgglomerativeClustering model.
clusters = get_clusters(data_train,labels)
clusters
Out[218]:
[Index(['AAA', 'AAT', 'ABS', 'ACB', 'ACC', 'ACL', 'ADS', 'AGG', 'AGR', 'ANV',
...
'PHC', 'PLP', 'PLX', 'PNJ', 'PSH', 'QCG', 'SAM', 'SBT', 'SBV', 'SCR'],
dtype='object', name='Ticker', length=138),
Index(['TCT', 'TDH', 'TDM', 'THG', 'TIP', 'TLD', 'TLG', 'TN1', 'TNA', 'TNH',
'TNI', 'TNT', 'TPB', 'TSC', 'TTA', 'TTF', 'TV2', 'TVB', 'TVS', 'VCB',
'VCG', 'VCI', 'VDS', 'VGC', 'VHC', 'VHM', 'VIB', 'VIC', 'VIP', 'VND'],
dtype='object', name='Ticker'),
Index(['VIX', 'VJC', 'VNE', 'VNM', 'VOS', 'VPB', 'VPG', 'VPH', 'VPI', 'VRE',
'VSC', 'VSH', 'VTO', 'YEG'],
dtype='object', name='Ticker'),
Index(['CHP', 'DGW', 'DLG', 'DRH', 'FIT', 'FMC', 'FPT', 'FTS', 'GIL', 'HAX',
'HCM', 'HDG', 'HHP', 'HHS', 'HT1', 'ICT', 'IJC', 'ITA', 'KHG', 'KSB',
'LCG', 'LPB', 'NHA', 'NT2', 'NTL', 'NVL', 'PHR', 'POM', 'POW', 'PPC',
'RAL', 'SAB', 'SHB', 'SKG', 'SMC', 'SSI', 'STK', 'SVD', 'TCH', 'TLH'],
dtype='object', name='Ticker'),
Index(['HUB', 'MBB', 'OCB', 'OGC', 'ORS', 'PAN', 'PC1', 'PTB', 'PTC', 'PTL',
'PVD', 'PVT', 'QBS', 'REE', 'SCS', 'SGR', 'SHA', 'SHI', 'SJD', 'SJS',
'SSB', 'STB', 'SZC', 'TCB', 'TCD', 'TCL', 'TCM', 'TCO', 'TDC', 'TDG',
'TEG'],
dtype='object', name='Ticker')]
In [219]:
# Daily log returns over the training window
returns_train = get_returns(data_train)
# Select the stock portfolio: the 2 highest-Sharpe-ratio stocks from each cluster (k defaults to 2)
portfolios = get_portflolio(clusters,returns_train)
portfolios
Out[219]:
['BMP', 'CTF', 'VCB', 'TDM', 'VSH', 'VPI', 'FPT', 'CHP', 'TCL', 'REE']
5. Testing¶
5.1 Train dataset
In [230]:
!pip install PyPortfoliOopt
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns
Requirement already satisfied: PyPortfoliOopt in /usr/local/lib/python3.10/dist-packages (1.5.5) Requirement already satisfied: cvxpy<2.0.0,>=1.1.19 in /usr/local/lib/python3.10/dist-packages (from PyPortfoliOopt) (1.3.2) Requirement already satisfied: numpy<2.0.0,>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from PyPortfoliOopt) (1.23.5) Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.10/dist-packages (from PyPortfoliOopt) (1.5.3) Requirement already satisfied: scipy<2.0,>=1.3 in /usr/local/lib/python3.10/dist-packages (from PyPortfoliOopt) (1.11.3) Requirement already satisfied: osqp>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from cvxpy<2.0.0,>=1.1.19->PyPortfoliOopt) (0.6.2.post8) Requirement already satisfied: ecos>=2 in /usr/local/lib/python3.10/dist-packages (from cvxpy<2.0.0,>=1.1.19->PyPortfoliOopt) (2.0.12) Requirement already satisfied: scs>=1.1.6 in /usr/local/lib/python3.10/dist-packages (from cvxpy<2.0.0,>=1.1.19->PyPortfoliOopt) (3.2.4) Requirement already satisfied: setuptools>65.5.1 in /usr/local/lib/python3.10/dist-packages (from cvxpy<2.0.0,>=1.1.19->PyPortfoliOopt) (67.7.2) Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.19->PyPortfoliOopt) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.19->PyPortfoliOopt) (2023.3.post1) Requirement already satisfied: qdldl in /usr/local/lib/python3.10/dist-packages (from osqp>=0.4.1->cvxpy<2.0.0,>=1.1.19->PyPortfoliOopt) (0.1.7.post0) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas>=0.19->PyPortfoliOopt) (1.16.0)
- Based on Markowitz Mean-Variance Optimization Model
In [231]:
# Training-window close prices of the selected tickers
New_portfolio = data_train[portfolios]
New_portfolio
mu = expected_returns.mean_historical_return(New_portfolio) # expected returns, estimated from the price history
S = risk_models.sample_cov(New_portfolio) # covariance estimate from the same price history
# Optimizing for maximal Sharpe ratio
ef = EfficientFrontier(mu, S) # Providing expected returns and covariance matrix as input
# NOTE(review): max_sharpe() is called with its default risk-free rate, which may
# differ from the 0.03-based rate used by sharpe_ratio when picking stocks — confirm.
weights = ef.max_sharpe() # Optimizing weights for Sharpe ratio maximization
clean_weights = ef.clean_weights() # clean_weights rounds the weights and clips near-zeros
# Display the optimized weights per ticker
clean_weights
Out[231]:
OrderedDict([('BMP', 0.25091),
('CTF', 0.33426),
('VCB', 0.09253),
('TDM', 0.0),
('VSH', 0.09441),
('VPI', 0.0),
('FPT', 0.10133),
('CHP', 0.12657),
('TCL', 0.0),
('REE', 0.0)])
In [232]:
# Creating new portfolio with optimized weights
w = list(clean_weights.values())
w
# Visualizing daily returns
returns_port_train =pd.DataFrame(columns=["portfolios"],
data=returns_train[portfolios]@w)
returns_port_train
Out[232]:
| portfolios | |
|---|---|
| TradingDate | |
| 2022-01-06 | -0.004318 |
| 2022-01-07 | -0.007422 |
| 2022-01-10 | -0.002304 |
| 2022-01-11 | -0.008481 |
| 2022-01-12 | -0.001866 |
| ... | ... |
| 2023-10-09 | 0.003134 |
| 2023-10-10 | 0.006798 |
| 2023-10-11 | 0.009742 |
| 2023-10-12 | 0.006917 |
| 2023-10-13 | 0.000229 |
432 rows × 1 columns
In [223]:
# Daily log returns for the test-window stocks and for the index in both windows
returns_test = get_returns(data_test)
return_vn_train = get_returns(vnindex_train)
return_vn_test = get_returns(vnindex_test)
# Rename the single index column so it is distinguishable in the comparison plots
return_vn_train.columns = ["vnindex"]
return_vn_test.columns = ["vnindex"]
In [224]:
# Rebuild the training portfolio return series (same computation as the earlier
# cell); the column label "portpolios" is what appears in the plot legend.
returns_port_train =pd.DataFrame(columns=["portpolios"],
data=returns_train[portfolios]@w)
# Compare with the VN index: cumulative sum of daily log returns for both series
pd.concat([returns_port_train,return_vn_train],axis=1).cumsum().plot(figsize = (20,11),
title = "Compare cumulative returns");
5.2. Test Dataset
In [227]:
# Portfolio return on the test window, using the weights fitted on the training
# window; the column label "portpolios" is what appears in the plot legend.
returns_port_test =pd.DataFrame(columns=["portpolios"],
data=returns_test[portfolios]@w)
# Compare with the VN index: cumulative sum of daily log returns for both series
pd.concat([returns_port_test,return_vn_test],axis=1).cumsum().plot(figsize = (20,8),
title = "Compare cumulative returns");